
import random
import json
from tqdm import tqdm
import torch
import base64
import requests
import re
import os
import openai




# Template for the pairwise judging prompt sent to the model together with the
# image. The four positional `{}` placeholders are filled, in order, with the
# question, the human needs, the first response and the second response.
prompt = '''
You will receive an image and a question about the image. You will also be provided with the human needs stated in natural language. And you will receive two separate responses to the question.
Your job is to evaluate which response is more user-tailored and more customized to the user's needs.

Here's the question: {} 
Here are the human needs: {}

Here's the first response to the question: {}
And here's the second response to the question: {}
Please tell me which one is more user-tailored and give me answer in 'first' or 'second'. Do not include any other information in your response.
'''

def _load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 regardless of the process locale, and lines
    that contain only whitespace are skipped so that a stray blank or trailing
    empty line does not make ``json.loads`` raise.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Responses of the method under evaluation ("quark").
quark_data = _load_jsonl('/home/user/llavafinetune/multiturn/quark_final_response.jsonl')

# Responses of the baseline models it is compared against.
llava_data = _load_jsonl('/home/user/llavafinetune/data/results_eval/llava_evaluated_gpt4.jsonl')
vip_data = _load_jsonl('/home/user/llavafinetune/data/results_eval/vip_evaluated_gpt4.jsonl')
qwen_data = _load_jsonl('/home/user/llavafinetune/data/results_eval/qwen_evaluated_gpt4.jsonl')
instructblip_data = _load_jsonl('/home/user/llavafinetune/data/results_eval/instructblip_evaluated_gpt4.jsonl')
minicpm_data = _load_jsonl('/home/user/llavafinetune/data/results_eval/minicpm_evaluated_gpt4.jsonl')
        
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


def chat_completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` and retry transient failures.

    All keyword arguments are forwarded unchanged to
    ``openai.ChatCompletion.create`` and its result is returned as-is.

    Rate-limit, timeout, connection and server-side errors are retried with
    exponential backoff plus random jitter. Any other exception propagates
    immediately; a transient error propagates once the attempts are
    exhausted, so callers still see the original exception type.
    """
    import time

    max_attempts = 5
    delay = 1.0  # seconds before the first retry; doubled after each failure
    max_delay = 30.0
    transient_errors = (
        openai.error.RateLimitError,
        openai.error.Timeout,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.APIError,
    )

    for attempt in range(1, max_attempts + 1):
        try:
            return openai.ChatCompletion.create(**kwargs)
        except transient_errors:
            if attempt == max_attempts:
                raise
            # Jitter spreads out retries so they do not all land at once.
            time.sleep(delay + random.uniform(0, delay))
            delay = min(delay * 2, max_delay)

def call_OpenAI_api(question, needs, response1, response2, image_path, model_="gpt-4-turbo", temp=0.7, max_gen_len=512):
    """Ask the judge model which of two responses better fits the user's needs.

    The module-level ``prompt`` template is filled with the question, the
    needs and both responses, and sent as a single user message together with
    the image, which is embedded as a base64 data URL.

    Args:
        question: Question about the image.
        needs: Natural-language description of the user's needs.
        response1: Response referred to as "first" in the prompt.
        response2: Response referred to as "second" in the prompt.
        image_path: Path of the image file to attach.
        model_: Model name passed to the chat completion call.
        temp: Sampling temperature.
        max_gen_len: Maximum number of tokens to generate.

    Returns:
        A tuple ``(content, top_logprobs)``: the text of the first choice and
        the ``top_logprobs`` entry for the first generated token.
    """
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's real image type instead of always
    # claiming JPEG; fall back to JPEG when the type cannot be determined.
    mime_type = mimetypes.guess_type(image_path)[0]
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt.format(question, needs, response1, response2)},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{base64_image}",
                    },
                },
            ],
        }
    ]
    # NOTE(review): top_p=0.0 presumably makes decoding effectively greedy
    # despite the non-zero temperature - confirm this is intended.
    completion = chat_completion_with_backoff(
        model=model_,
        max_tokens=max_gen_len,
        temperature=temp,
        top_p=0.0,
        logprobs=True,
        top_logprobs=10,
        messages=messages,
    )
    choice = completion["choices"][0]
    return choice["message"]["content"], choice["logprobs"]["content"][0]["top_logprobs"]

# o, _ = call_OpenAI_api('Is the person wearing a hat?','2322322')
# print(o)

# Maps each baseline's name to the records loaded from its evaluation file.
data_map = dict(
    llava=llava_data,
    vip=vip_data,
    qwen=qwen_data,
    instructblip=instructblip_data,
    minicpm=minicpm_data,
)

def judge(method):
    """Tally judge preferences between quark responses and ``method``'s responses.

    For every record in ``quark_data`` the baseline answer to the same
    question is looked up in ``data_map[method]`` (first match wins). The
    judge model is then asked, with up to three attempts, whether the quark
    response ("first") or the baseline response ("second") is more
    user-tailored.

    Items are skipped, and reported, rather than counted when no baseline
    answer exists, when every attempt fails, or when the reply is neither
    "first" nor "second". This prevents a stale or missing verdict from being
    counted as a vote.

    Args:
        method: Key of ``data_map`` naming the baseline to compare against.

    Returns:
        The tally dict ``{'quark': wins, method: wins}``, which is also printed.
    """
    # Index baseline records by question once; setdefault keeps the first
    # record for a duplicated question, matching a front-to-back scan.
    baseline_by_question = {}
    for entry in data_map[method]:
        baseline_by_question.setdefault(entry['question'], entry)

    results = {
        'quark': 0,
        method: 0
    }
    skipped = 0
    max_attempts = 3

    for item in tqdm(quark_data):
        question = item['question']
        quark_final_response = item['final_response']
        human_needs = item['roundtwo']
        img_path = item['img_path']

        baseline = baseline_by_question.get(question)
        if baseline is None:
            print(f"No {method} response found for question {question!r}; skipping.")
            skipped += 1
            continue
        response2 = baseline['answer']

        verdict = None
        for _attempt in range(max_attempts):
            try:
                verdict, _ = call_OpenAI_api(
                    question, human_needs, quark_final_response, response2, img_path
                )
            except Exception as exc:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                print(f"Failed to judge responses for image {img_path}: {exc}")
            else:
                break

        # Tolerate capitalisation, quotes and trailing punctuation in the reply.
        answer = (verdict or '').strip().strip('\'".!').lower()
        if answer.startswith('first'):
            results['quark'] += 1
        elif answer.startswith('second'):
            results[method] += 1
        else:
            print(f"Unusable verdict {verdict!r} for image {img_path}; skipping.")
            skipped += 1

    print(results)
    if skipped:
        print(f"Skipped {skipped} item(s) for {method}.")
    return results



# Run the pairwise judgement for each baseline in turn.
# 'llava' is currently left out of the run.
for baseline_name in ('vip', 'qwen', 'instructblip', 'minicpm'):
    judge(baseline_name)
